library(tidyverse)

# Load every object stored in the .RData file `fname` into `env` instead of
# the calling environment, and hand that environment back.
#
# Args:
#   fname: path to a file readable by load().
#   env:   environment that receives the loaded objects; a fresh one is
#          created when the argument is not supplied.
#
# Returns: `env`, now containing the loaded objects.
load_to_env <- function(fname, env = new.env()) {
    load(file = fname, envir = env)
    env
}

# Load the stable topic model. The file is expected to provide `out` and
# `target_model`, both of which are used below.
load("data/stable_topic_model.RData")
orig_out <- out

# Rows belonging to the "RT" publisher; both orderings below are derived
# from the column means over these rows.
rt_docs <- which(orig_out$meta$publisher == "RT")

# First ordering: columns of theta sorted by decreasing mean over RT rows.
rt_topic_means <- colMeans(target_model$theta[rt_docs, ])
first_remapping <- order(rt_topic_means, decreasing = TRUE)
orig_theta <- target_model$theta[, first_remapping]

# TOPICS TO MERGE (positions after the first ordering):
# 34 - 57 - 87
# 58 - 69
assertthat::assert_that(ncol(orig_theta) == 89)
merged_34_57_87 <- orig_theta[, 34] + orig_theta[, 57] + orig_theta[, 87]
merged_58_69 <- orig_theta[, 58] + orig_theta[, 69]
# deparse.level = 0 keeps the two appended columns unnamed.
orig_theta <- cbind(orig_theta, merged_34_57_87, merged_58_69, deparse.level = 0)
# Remove the five source columns now that their sums have been appended.
orig_theta <- orig_theta[, -c(34, 57, 87, 58, 69)]

# Second ordering: re-sort the merged columns by decreasing mean over RT rows.
merged_rt_topic_means <- colMeans(orig_theta[rt_docs, ])
second_remapping <- order(merged_rt_topic_means, decreasing = TRUE)
orig_theta <- orig_theta[, second_remapping]
assertthat::assert_that(ncol(orig_theta) == 86)

# Per-publisher result files handled by the loop further down.
target_files <- list.files("data/extra_results_new_model", full.names = TRUE)

# Apply the topic re-ordering and merging to a theta matrix.
#
# Steps, in order:
#   1. reorder the columns of `theta` by `first`;
#   2. append two merged columns, (34 + 57 + 87) and (58 + 69), using the
#      column positions after step 1;
#   3. remove the columns listed in `drop`;
#   4. reorder the remaining columns by `second`.
#
# Args:
#   theta:  numeric matrix with one column per topic. Columns 34, 57, 58,
#           69 and 87 must exist after step 1.
#   first:  column index vector for step 1 (defaults to the global
#           `first_remapping`).
#   drop:   column positions removed in step 3 (defaults to the five
#           columns that were merged).
#   second: column index vector for step 4 (defaults to the global
#           `second_remapping`).
#
# Returns: a matrix with the same number of rows as `theta` and one column
# per entry of `second`. The input width is not validated here.
aggregate_topics <- function(theta,
                             first = first_remapping,
                             drop = c(34, 57, 87, 58, 69),
                             second = second_remapping) {
    # Use the `first` / `second` arguments rather than the globals, so that
    # explicitly passed orderings are honoured. `drop = FALSE` in the
    # subsetting calls is the argument of `[` (unrelated to the `drop`
    # parameter): it keeps a one-row theta as a matrix.
    theta <- theta[, first, drop = FALSE]
    merged_34_57_87 <- theta[, 34] + theta[, 57] + theta[, 87]
    merged_58_69 <- theta[, 58] + theta[, 69]
    # deparse.level = 0 keeps the appended columns unnamed.
    theta <- cbind(theta, merged_34_57_87, merged_58_69, deparse.level = 0)
    theta <- theta[, -drop, drop = FALSE]
    theta[, second, drop = FALSE]
}

# Build the combined extra-results table: for every file in `target_files`,
# aggregate its theta matrix, attach the dominant topic and the per-topic
# columns to the document metadata, then stack all files and write the
# result as TSV and Parquet.
res <- list()
#target_files <- target_files[2:length(target_files)]
for (fname in target_files) {
    # The publisher label is the text between "extra_results_" and ".RData"
    # in the file path (capture group 1).
    publisher <- str_extract(fname, "(?:.*)/extra_results_(.*)\\.RData", 1)
    print(publisher)
    e <- load_to_env(fname)

    # aggregate_topics() addresses topics by column position, so a theta of
    # any other width would either fail or be silently mis-mapped. Report
    # the file and skip it instead of carrying on with it.
    if (ncol(e$res$theta) != 89) {
        warning(
            "BAD DATA: ", publisher, " (expected 89 topic columns, found ",
            ncol(e$res$theta), "); file skipped",
            call. = FALSE, immediate. = TRUE
        )
        next
    }
    theta <- aggregate_topics(e$res$theta)

    meta <- e$newdocs$meta
    # max.col() breaks ties at random by default, and in that mode treats
    # entries within a relative tolerance of 1e-5 as tied. "first" makes the
    # assignment reproducible and guarantees that `topic` points at the
    # column holding the row maximum stored in `topic_prob`.
    meta$topic <- max.col(theta, ties.method = "first")
    meta$topic_prob <- apply(theta, 1, max)

    # One column per aggregated topic: prop_topic_1, prop_topic_2, ...
    theta_df <- as_tibble(
        theta,
        .name_repair = \(nms) str_c("prop_topic_", seq_len(ncol(theta)))
    )
    res[[fname]] <- as_tibble(meta) |>
        rename(topic_number = topic) |>
        bind_cols(theta_df)
}

res <- bind_rows(res)

write_tsv(res, "data/extra_results.tsv")
arrow::write_parquet(res, "data/extra_results.parquet")